数値データ
X1
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X1)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()

X12
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X12)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X12 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X13
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X13)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X13 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X14
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X14)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X14 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X15
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X15)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X15 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X16
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X16)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X16 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X17
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X17)) +
geom_density(aes(fill = y), alpha = 1/2)

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X17 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2)

X18
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X18)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 4717 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X18 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 4717 rows containing non-finite values (stat_density).

X19
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X19)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 4842 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X19 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 4842 rows containing non-finite values (stat_density).

X20
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X20)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5332 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X20 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5332 rows containing non-finite values (stat_density).

X21
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X21)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5760 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X21/ X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 5760 rows containing non-finite values (stat_density).

X22
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X22)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6002 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X22 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6002 rows containing non-finite values (stat_density).

X23
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
ggplot(aes(X23)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6420 rows containing non-finite values (stat_density).

df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = X23 / X1
) %>%
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 6420 rows containing non-finite values (stat_density).

X18〜X23 の和
df.train %>%
dplyr::mutate(
y = (y == "1"),
ratio = (X18 + X19 + X20 + X21 + X22 + X23) / X1
) %>%
#summary()
ggplot(aes(ratio)) +
geom_density(aes(fill = y), alpha = 1/2) +
scale_x_log10()
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Removed 1288 rows containing non-finite values (stat_density).

カテゴリ値
df.category_summary <- df.train %>%
dplyr::mutate(y = (y == "1")) %>%
dplyr::select_if(function(x) { is.character(x) | is.factor(x) | is.logical(x) }) %>%
tidyr::gather(key = "variable", value = "category", -y) %>%
dplyr::group_by(variable, category) %>%
dplyr::summarise(
count = n(),
success_ratio = mean(y)
) %>%
dplyr::mutate(count_ratio = count / sum(count)) %>%
dplyr::ungroup()
## Warning: attributes are not identical across measure variables;
## they will be dropped
単一変数
構成比
df.category_summary %>%
dplyr::mutate(
variable = forcats::fct_relevel(variable, stringr::str_c("X", c(2:4, 6:11), sep = "")),
category = factor(category, levels = -2:8)
) %>%
ggplot(aes(category, count_ratio)) +
geom_col(aes(fill = variable), show.legend = F) +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
labs(
x = NULL,
y = NULL
) +
facet_grid(. ~ variable, scales = "free_x", space = "free_x")

デフォルト率
df.category_summary %>%
dplyr::mutate(
variable = forcats::fct_relevel(variable, stringr::str_c("X", c(2:4, 6:11), sep = "")),
category = factor(category, levels = -2:8)
) %>%
ggplot(aes(category, success_ratio)) +
geom_col(aes(fill = variable), show.legend = F) +
geom_hline(yintercept = mean(dplyr::mutate(df.train, y = (y == "1"))$y), linetype = 2, size = 1.1, alpha = 2/3) +
scale_y_continuous(limits = c(0, 1), labels = scales::percent) +
labs(
x = NULL,
y = NULL
) +
facet_grid(. ~ variable, scales = "free_x", space = "free_x")

2 変数の掛け合わせ
midpoint <- mean(dplyr::mutate(df.train, y=(y=="1"))$y)
X2 ~ X3
df.train %>%
dplyr::mutate(
y = (y == "1")
) %>%
dplyr::group_by(X2, X3) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(X2, X3)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X2 ~ X4
df.train %>%
dplyr::mutate(
y = (y == "1")
) %>%
dplyr::group_by(X2, X4) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(X2, X4)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X3 ~ X4
df.train %>%
dplyr::mutate(
y = (y == "1")
) %>%
dplyr::group_by(X3, X4) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(X3, X4)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X5(年齢)
ヒストグラム
df.train %>%
ggplot(aes(X5)) +
geom_histogram(binwidth = 5, colour = "white")

df.train %>%
dplyr::mutate(y = (y == "1")) %>%
dplyr::mutate(
age_segment = dplyr::case_when(
X5 <= 20 ~ "lte20",
X5 <= 30 ~ "lte30",
X5 <= 40 ~ "lte40",
X5 <= 50 ~ "lte50",
X5 <= 60 ~ "lte60",
X5 <= 70 ~ "lte70",
X5 <= 80 ~ "lte80",
T ~ "other"
) %>% forcats::fct_relevel(stringr::str_c("lte", seq(30, 80, 10)))
) %>%
dplyr::group_by(age_segment) %>%
dplyr::summarise(
n = n(),
default_ratio = mean(y)
) %>%
ggplot(aes(age_segment, default_ratio)) +
geom_point(aes(size = n)) +
scale_size_area() +
geom_line(aes(group = 1)) +
geom_hline(yintercept = midpoint, linetype = 2, size = 1.1, colour = "red") +
scale_y_continuous(labels = scales::percent, limits = c(0, 0.4))

X2 ~ X5
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
dplyr::mutate(
age_segment = dplyr::case_when(
X5 <= 20 ~ "lte20",
X5 <= 30 ~ "lte30",
X5 <= 40 ~ "lte40",
X5 <= 50 ~ "lte50",
X5 <= 60 ~ "lte60",
X5 <= 70 ~ "lte70",
X5 <= 80 ~ "lte80",
T ~ "other"
) %>% forcats::fct_relevel(stringr::str_c("lte", seq(30, 80, 10)))
) %>%
dplyr::group_by(X2, age_segment) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(age_segment, X2)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X3 ~ X5
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
dplyr::mutate(
age_segment = dplyr::case_when(
X5 <= 20 ~ "lte20",
X5 <= 30 ~ "lte30",
X5 <= 40 ~ "lte40",
X5 <= 50 ~ "lte50",
X5 <= 60 ~ "lte60",
X5 <= 70 ~ "lte70",
X5 <= 80 ~ "lte80",
T ~ "other"
) %>% forcats::fct_relevel(stringr::str_c("lte", seq(30, 80, 10)))
) %>%
dplyr::group_by(X3, age_segment) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(age_segment, X3)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X4 ~ X5
df.train %>%
dplyr::mutate(y = (y == "1")) %>%
dplyr::mutate(
age_segment = dplyr::case_when(
X5 <= 20 ~ "lte20",
X5 <= 30 ~ "lte30",
X5 <= 40 ~ "lte40",
X5 <= 50 ~ "lte50",
X5 <= 60 ~ "lte60",
X5 <= 70 ~ "lte70",
X5 <= 80 ~ "lte80",
T ~ "other"
) %>% forcats::fct_relevel(stringr::str_c("lte", seq(30, 80, 10)))
) %>%
dplyr::group_by(X4, age_segment) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(age_segment, X4)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)

X6 ~ X7
df.train %>%
dplyr::mutate(
y = (y == "1"),
X6 = forcats::fct_collapse(X6,
others = c("3", "4", "5", "6", "7", "8")
),
X7 = forcats::fct_collapse(X7,
others = c("3", "4", "5", "6", "7", "8")
)
) %>%
dplyr::group_by(X6, X7) %>%
dplyr::summarise(
n = n(),
ratio = mean(y)
) %>%
ggplot(aes(X6, X7)) +
geom_tile(aes(fill = ratio), colour = "black") +
geom_label(aes(label = scales::percent(ratio))) +
scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = midpoint)
